Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1129c25c0>
test_obj.df[['GHI', 'GHI mean', 'GHI gauss10 mean', 'GHI gauss1 mean', 'GHI gauss.1 mean', 'GHI gauss.5 mean', 'GHI gauss.01 mean']].iplot()
Clean data by setting some cutoffs (by hand). For this study, GHI/GHIcs mean has to be >= .9 and the coefficient of variance must be less than .1. Also need to limit GHI-GHIcs (due to low irradiance periods) to
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1139515f8>
fig, ax = plt.subplots(figsize=(24, 8))
nsrdb_mask = test_obj.df['sky_status'].values
ax.plot(test_obj.df.index, test_obj.df['GHI'], label='GHI', alpha=.5)
ax.plot(test_obj.df.index, test_obj.df['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
ax.scatter(test_obj.df[nsrdb_mask & ~pred].index, test_obj.df[nsrdb_mask & ~pred]['GHI'], label='NSRDB only', zorder=10, s=50)
ax.scatter(test_obj.df[pred & ~nsrdb_mask].index, test_obj.df[pred & ~nsrdb_mask]['GHI'], label='RF only', zorder=10, s=50)
ax.scatter(test_obj.df[nsrdb_mask & pred].index, test_obj.df[nsrdb_mask & pred]['GHI'], label='Both', zorder=10, s=50)
_ = ax.legend(bbox_to_anchor=(1.25, 1))
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
Feature importance and correlation investigation¶
The previous two sections show promising results going from default to cleaned data. We see marked improvements in scoring and the confusion matrix. The features used in both cases are highly correlated, though. Here we will look at feature importances in a different way - by removing 1 at a time and recalculating the F1 score.
Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 344, in day_prediction
calc_all_window_metrics(day, window, meas_col=meas_col, model_col=model_col, overwrite=overwrite)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 62, in calc_all_window_metrics
calc_line_length(df, meas_col, window, dx, label=label_ll_1, overwrite=overwrite)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 286, in calc_line_length
df[label] = df[col].rolling(window, center=True).apply(lambda x: line_length(x, dx)).fillna(0)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 352, in day_prediction
clear_model = day[y_pred][model_col]
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 1207, in apply
return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 1958, in __getitem__
return self._getitem_array(key)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 856, in apply
center=False)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 801, in _apply
result = calc(values)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 2000, in _getitem_array
return self.take(indexer, axis=0, convert=False)
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 1928, in take
convert=True, verify=True)
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 795, in calc
closed=self.closed)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 4011, in take
axis=axis, allow_dups=True)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3901, in reindex_indexer
return self.__class__(new_blocks, new_axes)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 853, in f
offset, func, args, kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 2797, in __init__
self._consolidate_check()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3305, in _consolidate_check
ftypes = [blk.ftype for blk in self.blocks]
File "pandas/_libs/window.pyx", line 1462, in pandas._libs.window.roll_generic (pandas/_libs/window.c:36229)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3305, in <listcomp>
ftypes = [blk.ftype for blk in self.blocks]
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 309, in ftype
return "%s:%s" % (self.dtype, self._ftype)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
self._terminate_backend()
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 286, in <lambda>
df[label] = df[col].rolling(window, center=True).apply(lambda x: line_length(x, dx)).fillna(0)
KeyboardInterrupt
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 304, in line_length
ydiffs = np.diff(x)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
self._backend.terminate()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
self._pool.terminate() # terminate does a join()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/numpy/lib/function_base.py", line 1926, in diff
return a[slice1]-a[slice2]
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 768, in __call__
self.retrieve()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
self._terminate()
KeyboardInterrupt
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 719, in retrieve
raise exception
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
worker_handler.join()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 682, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 602, in get
self.wait(timeout)
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
self._wait_for_tstate_lock()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 599, in wait
self._event.wait(timeout)
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 549, in wait
signaled = self._cond.wait(timeout)
KeyboardInterrupt
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 293, in wait
waiter.acquire()
KeyboardInterrupt
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-32-627439ef4ce4> in <module>()
9 clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
10 clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
---> 11 pred = test_obj.predict(clf)
12 test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
13 f1_score = metrics.f1_score(test_obj.get_masked_df()['sky_status'], pred[test_obj.df['quality_mask']])
~/duramat/clearsky_detection/cs_detection.py in predict(self, *args, **kwargs)
691
692 def predict(self, *args, **kwargs):
--> 693 return self.iter_predict_daily(*args, **kwargs)
694
695 def downsample(self, min_freq):
~/duramat/clearsky_detection/cs_detection.py in iter_predict_daily(self, clf, n_iter, tol, ml_label, overwrite, multiproc, by_day)
678 )
679 pool.close()
--> 680 pool.join()
681 else:
682 final_dfs = [utils.day_prediction(day, clf, self.features_, self.window, self.meas_col, self.model_col,
~/miniconda3/lib/python3.5/multiprocessing/pool.py in join(self)
508 util.debug('joining pool')
509 assert self._state in (CLOSE, TERMINATE)
--> 510 self._worker_handler.join()
511 self._task_handler.join()
512 self._result_handler.join()
~/miniconda3/lib/python3.5/threading.py in join(self, timeout)
1052
1053 if timeout is None:
-> 1054 self._wait_for_tstate_lock()
1055 else:
1056 # the behavior of a negative timeout isn't documented, but
~/miniconda3/lib/python3.5/threading.py in _wait_for_tstate_lock(self, block, timeout)
1068 if lock is None: # already determined that the C code is done
1069 assert self._is_stopped
-> 1070 elif lock.acquire(block, timeout):
1071 lock.release()
1072 self._stop()
KeyboardInterrupt:
Process ForkPoolWorker-14:
Process ForkPoolWorker-13:
Process ForkPoolWorker-15:
Process ForkPoolWorker-16:
Traceback (most recent call last):
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
Traceback (most recent call last):
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
self._terminate_backend()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
self._terminate_backend()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
self._backend.terminate()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
self._terminate_backend()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
self._backend.terminate()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
self._pool.terminate() # terminate does a join()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
self._backend.terminate()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
self._pool.terminate() # terminate does a join()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
self._terminate()
File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
y_pred = clf.predict(X)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
self._terminate()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
self._pool.terminate() # terminate does a join()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
self._terminate()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
proba = self.predict_proba(X)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
worker_handler.join()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
for e in self.estimators_)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
worker_handler.join()
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
self._wait_for_tstate_lock()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
worker_handler.join()
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
self._wait_for_tstate_lock()
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
self._terminate_backend()
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
self._wait_for_tstate_lock()
KeyboardInterrupt
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
self._backend.terminate()
KeyboardInterrupt
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
KeyboardInterrupt
File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
self._pool.terminate() # terminate does a join()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
self._terminate()
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
worker_handler.join()
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
self._wait_for_tstate_lock()
File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
KeyboardInterrupt
Full features
F1: 0.9866607381058248
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean')
F1: 0.9856331440026729
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std')
F1: 0.9577911055788164
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean')
F1: 0.9385054192812322
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std')
F1: 0.9371298405466971
features: ('GHILL-GHIcsLL', 'abs(t-tnoon)')
F1: 0.7715979316433346
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std')
F1: 0.985423389340158
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean')
F1: 0.9853745673774701
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std')
F1: 0.9849649181423321
features: ('abs(GHI-GHIcs) mean', 'abs(t-tnoon)')
F1: 0.9841906034290804
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9587466185752931
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9558723693143246
features: ('abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.8895328619450562
features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9494128274616079
features: ('dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.7775988105563127
features: ('dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9019607843137255
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std')
F1: 0.9864263462394304
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean')
F1: 0.9869608826479438
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std')
F1: 0.985510477039679
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(t-tnoon)')
F1: 0.9859563085153812
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9683237515499944
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9661573288058857
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.9654549559720027
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9620510503727129
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9518619436875567
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9541180469015521
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9849749582637729
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.985552344965548
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.986
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.985087914533719
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9870881567230633
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9863044204431579
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9650979509119567
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9652144545761566
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9621938833088816
features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9511643680759666
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9868801423171003
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9853300733496334
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.9868830591373944
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9862191598132919
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9873164218958611
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.986289153940475
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9702547985183522
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9708563069652302
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9679241021007453
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.966884433430953
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.986
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9862160960426856
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9852074296518742
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9860879243183084
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9699628754640567features = ['GHI-GHIcs mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)']
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
train_obj.features_ = features
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
test_obj.features_ = features
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
pred = test_obj.predict(clf)
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
print(f1_score)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
# _ = ax.legend(bbox_to_anchor=(1.25, 1))
# _ = ax.set_xlabel('Date')
# _ = ax.set_ylabel('GHI / Wm$^{-2}$')
iplot([trace1, trace2, trace3, trace4, trace5])
CV scoring and model selection (2010-2015 only)¶
detect_obj.df[['GHI', 'GHI mean', 'GHI gauss10 mean', 'GHI gauss1 mean', 'GHI gauss.1 mean']].iplot()
The previous two sections show promising results going from default to cleaned data. We see marked improvements in scoring and the confusion matrix. The features used in both cases are highly correlated, though. Here we will look at feature importances in a different way - by removing 1 at a time and recalculating the F1 score.
features = [
# 'GHILL-GHIcsLL',
'GHI-GHIcs mean',
'GHI-GHIcs std',
'dGHI-dGHIcs mean',
# 'dGHI-dGHIcs std',
'abs(t-tnoon)']detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
train_obj.features_ = features
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
test_obj.features_ = features
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
pred = test_obj.predict(clf)
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
print('Full features'.format([i for i in features]))
print(' F1: {}'.format(f1_score))
vis = visualize.Visualizer()
vis.plot_corr_matrix(test_obj.df[test_obj.features_].corr().values, test_obj.features_)
# for i in range(3, len(features) - 1):
# for feat_set in itertools.combinations(features, i):
# detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
# detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
# train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
# train_obj.trim_dates(None, '01-01-2015')
# train_obj.features_ = list(feat_set)
# test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
# test_obj.trim_dates('01-01-2015', None)
# test_obj.features_ = list(feat_set)
# clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
# clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
# pred = test_obj.predict(clf)
# test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
# f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
# print('features: {}'.format(feat_set))
# print(' F1: {}'.format(f1_score))cm = metrics.confusion_matrix(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))fig, ax = plt.subplots(figsize=(12, 8))
_ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
_ = ax.set_xticks(range(len(clf.feature_importances_)))
_ = ax.set_xticklabels(test_obj.features_, rotation=45)
_ = ax.set_ylabel('Importance')
_ = ax.set_xlabel('Feature')
_ = fig.tight_layout()# fig, ax = plt.subplots(figsize=(12, 8))
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
# _ = ax.legend(bbox_to_anchor=(1.25, 1))
# _ = ax.set_xlabel('Date')
# _ = ax.set_ylabel('GHI / Wm$^{-2}$')
iplot([trace1, trace2, trace3, trace4, trace5])Full features
F1: 0.9866607381058248
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean')
F1: 0.9856331440026729
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std')
F1: 0.9577911055788164
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean')
F1: 0.9385054192812322
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std')
F1: 0.9371298405466971
features: ('GHILL-GHIcsLL', 'abs(t-tnoon)')
F1: 0.7715979316433346
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std')
F1: 0.985423389340158
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean')
F1: 0.9853745673774701
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std')
F1: 0.9849649181423321
features: ('abs(GHI-GHIcs) mean', 'abs(t-tnoon)')
F1: 0.9841906034290804
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9587466185752931
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9558723693143246
features: ('abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.8895328619450562
features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9494128274616079
features: ('dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.7775988105563127
features: ('dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9019607843137255
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std')
F1: 0.9864263462394304
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean')
F1: 0.9869608826479438
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std')
F1: 0.985510477039679
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(t-tnoon)')
F1: 0.9859563085153812
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9683237515499944
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9661573288058857
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.9654549559720027
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9620510503727129
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9518619436875567
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9541180469015521
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9849749582637729
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.985552344965548
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.986
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.985087914533719
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9870881567230633
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9863044204431579
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9650979509119567
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9652144545761566
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9621938833088816
features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9511643680759666
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean')
F1: 0.9868801423171003
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std')
F1: 0.9853300733496334
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)')
F1: 0.9868830591373944
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9862191598132919
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9873164218958611
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.986289153940475
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.9702547985183522
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9708563069652302
features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9679241021007453
features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.966884433430953
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std')
F1: 0.986
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)')
F1: 0.9862160960426856
features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9852074296518742
features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9860879243183084
features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)')
F1: 0.9699628754640567features = ['GHI-GHIcs mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)']
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
train_obj.features_ = features
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
test_obj.features_ = features
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
pred = test_obj.predict(clf)
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
print(f1_score)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
# _ = ax.legend(bbox_to_anchor=(1.25, 1))
# _ = ax.set_xlabel('Date')
# _ = ax.set_ylabel('GHI / Wm$^{-2}$')
iplot([trace1, trace2, trace3, trace4, trace5])